##### ########################################################## ######
#####                                                            ######
#####   Input: Various dictionary files
#####   Output: Quanteda dictionaries
#####                                                            ######
##### ########################################################## ######

# Remove every object from the current environment so the script starts from
# an empty workspace.
# NOTE(review): if this file is source()d from another script or an interactive
# session, this also wipes that session's objects; running the script in a
# fresh R session would make this line unnecessary.
rm(list=ls())

# Load libraries

library(quanteda) # CRAN v3.0.0
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.22
library(data.table) # CRAN 1.13.6
library(text2vec) # CRAN v0.6
library(ukbabynames) #CRAN v0.1.1

# Logistic transform of `x`: returns 1 / (1 + exp(-a * (x - c))).
# The output is 0.5 where x == c, and `a` scales how fast it moves away from
# 0.5 as x departs from c. Works element-wise on vectors.
sigmoid_new <- function(x, a, c) {
  # a * (c - x) is the same quantity as -a * (x - c)
  exponent <- a * (c - x)
  1 / (1 + exp(exponent))
}
# Strip leading and trailing whitespace from each element of `s`.
trim <- function(s) {
  edge_whitespace <- "^[[:space:]]+|[[:space:]]+$"
  gsub(pattern = edge_whitespace, replacement = "", x = s)
}

# Score every vocabulary word by its cosine similarity to the mean embedding of
# a dictionary.
#
# Arguments:
#   dictionary_words  Character vector of dictionary entries. An entry that
#                     contains "*" is a wildcard: the asterisks are removed and
#                     the remainder is matched as a literal prefix. The special
#                     entry "^[[:digit:]]" matches every vocabulary word that
#                     starts with a digit. Any other entry must equal a
#                     vocabulary word exactly. NA entries are ignored.
#   word_vectors      Embedding matrix with one row per vocabulary word; the
#                     row names are the words.
#
# Returns a data.table with one row per vocabulary word, sorted by decreasing
# score, with columns:
#   word                    the vocabulary word
#   score                   cosine similarity to the dictionary's mean embedding
#   in_original_dictionary  TRUE if the word was matched by `dictionary_words`
#   sigmoid                 sigmoid_new(score, 40, .35)
embedding_dictionary_scores <- function(dictionary_words, word_vectors){

  vocabulary <- rownames(word_vectors)
  if (is.null(vocabulary)) {
    stop("`word_vectors` must have row names (the vocabulary words).", call. = FALSE)
  }

  digit_token <- "^[[:digit:]]"

  # NA entries cannot match anything and would otherwise break the comparisons
  dictionary_words <- dictionary_words[!is.na(dictionary_words)]

  is_digit_token <- dictionary_words == digit_token
  is_wildcard <- grepl("*", dictionary_words, fixed = TRUE)

  # Exact entries are resolved in a single vectorised lookup
  in_dictionary <- vocabulary %in% dictionary_words[!is_wildcard & !is_digit_token]

  # Wildcard entries are literal prefixes, so characters such as "." or "(" in a
  # dictionary word are not interpreted as regular-expression syntax
  prefixes <- unique(gsub("*", "", dictionary_words[is_wildcard], fixed = TRUE))
  for (prefix in prefixes) {
    in_dictionary <- in_dictionary | startsWith(vocabulary, prefix)
  }

  if (any(is_digit_token)) {
    in_dictionary <- in_dictionary | grepl(digit_token, vocabulary)
  }

  matches <- which(in_dictionary)

  if (length(matches) == 0L) {
    warning("No dictionary words were found in the embedding vocabulary; ",
            "the mean embedding is undefined.", call. = FALSE)
  }

  target_vector <- word_vectors[matches, , drop = FALSE]

  # Create mean word-embedding for dictionary (kept as a 1-row matrix for sim2)
  target_vector <- t(as.matrix(apply(target_vector, 2, mean)))

  # Similarity between each word in the vocabulary and the mean embedding
  cos_sim <- sim2(target_vector, word_vectors, method = "cosine", norm = "l2")

  scored_words <- dimnames(cos_sim)[[2]]

  word_scores <- data.table(word = scored_words,
                            score = cos_sim[1, ],
                            in_original_dictionary = scored_words %in% vocabulary[matches])

  word_scores <- word_scores[order(word_scores$score, decreasing = TRUE), ]

  # Squash the similarity with a logistic curve centred on a score of .35
  word_scores$sigmoid <- sigmoid_new(word_scores$score, 40, .35)

  return(word_scores)

}

# Convenience function to take a liwc-style dictionary and return all word matches

# Expand a LIWC-style word list into the vocabulary words it matches.
#
# Arguments:
#   original_words  Character vector of dictionary entries. An entry that
#                   contains "*" is a wildcard: the asterisks are removed and
#                   the remainder is matched as a literal prefix. The special
#                   entry "^[[:digit:]]" matches every vocabulary word that
#                   starts with a digit. Any other entry must equal a
#                   vocabulary word exactly. NA entries are ignored.
#   vocabulary      Character vector of candidate words. Defaults to the row
#                   names of the global `word_vectors` matrix, so existing
#                   one-argument calls keep working.
#
# Returns the sorted, de-duplicated vocabulary words matched by any entry.
expand_dictionary <- function(original_words, vocabulary = rownames(word_vectors)){

  digit_token <- "^[[:digit:]]"

  # NA entries cannot match anything and would otherwise break the comparisons
  original_words <- original_words[!is.na(original_words)]

  is_digit_token <- original_words == digit_token
  is_wildcard <- grepl("*", original_words, fixed = TRUE)

  # Exact entries are resolved in a single vectorised lookup
  matched <- vocabulary %in% original_words[!is_wildcard & !is_digit_token]

  # Wildcard entries are literal prefixes, so characters such as "." or "(" in a
  # dictionary word are not interpreted as regular-expression syntax
  prefixes <- unique(gsub("*", "", original_words[is_wildcard], fixed = TRUE))
  for (prefix in prefixes) {
    matched <- matched | startsWith(vocabulary, prefix)
  }

  if (any(is_digit_token)) {
    matched <- matched | grepl(digit_token, vocabulary)
  }

  return(sort(unique(vocabulary[matched])))

}

## Source dictionaries

load("data/dictionaries/liwc.Rdata") # presumably defines `liwc` (used below) -- confirm

## Load word vectors

load("working/word_vectors_150.Rdata") # presumably defines `word_vectors` -- confirm

## Drop parliamentary jargon vectors

# drop = FALSE keeps `word_vectors` a matrix even if only one row survives the filter
word_vectors <- word_vectors[!grepl("parljargon|parlboilerplate", rownames(word_vectors)), , drop = FALSE]

## Define seed dictionaries

# Keep only the ukbabynames rows whose `year` lies between 1970 and 2019
babynames <- data.table(ukbabynames[ukbabynames$year %in% 1970:2019, ])

# For each year, take up to 100 names with the largest `n` per sex, then pool
# the names across years. head() is used instead of `[1:100]` so that a year
# with fewer than 100 rows does not introduce NA names.
girlnames <- unique(babynames[sex == "F", head(name[order(n, decreasing = TRUE)], 100L), by = year]$V1)
boynames <- unique(babynames[sex == "M", head(name[order(n, decreasing = TRUE)], 100L), by = year]$V1)

# From here on `babynames` is the combined character vector of names
babynames <- c(girlnames, boynames)

# Anecdote seeds: first column of the seed CSV (lower-cased and trimmed),
# followed by the lower-cased baby names.
anecdote_words <- read.csv("data/dictionaries/seed_words_anecdote.csv",
                           stringsAsFactors = FALSE)[, 1]
anecdote_words <- c(trim(tolower(anecdote_words)), tolower(babynames))

# Aggression seeds: first column of the seed CSV, trimmed (case left as-is).
aggression_words <- read.csv("data/dictionaries/LH_aggression_seed.csv",
                             stringsAsFactors = FALSE)[, 1]
aggression_words <- trim(aggression_words)

# Affect seeds are taken directly from liwc$Affect. Fact seeds combine
# liwc$Number and liwc$Quant with the leading-digit pattern and the token "000".
affect_words <- liwc$Affect
fact_words <- c(
  liwc$Number,
  liwc$Quant,
  "^[[:digit:]]",
  "000"
)

# Positive / negative emotion seeds come from the RID dictionary and are
# expanded against the embedding vocabulary straight away.
posemo_words <- expand_dictionary(
  data_dictionary_RID$EMOTIONS$POSITIVE_AFFECT
)
negemo_words <- expand_dictionary(
  c(data_dictionary_RID$EMOTIONS$ANXIETY,
    data_dictionary_RID$EMOTIONS$SADNESS)
)

# Expand every seed list and bundle the results into one quanteda dictionary.
# NOTE(review): posemo_words and negemo_words were already expanded above, so
# they pass through expand_dictionary() a second time here -- confirm whether
# that is intended before removing it.
dictionary_to_use <- dictionary(lapply(
  list(affect = affect_words,
       posemo = posemo_words,
       negemo = negemo_words,
       fact = fact_words,
       anecdote = anecdote_words,
       aggression = aggression_words),
  expand_dictionary
))

# Compute embedding-based scores for each dictionary key. Iterating over a
# self-named vector of keys makes lapply() return a list already named by key.
word_scores <- lapply(
  setNames(nm = names(dictionary_to_use)),
  function(dictionary_name) {
    embedding_dictionary_scores(dictionary_to_use[[dictionary_name]],
                                word_vectors = word_vectors)
  }
)

# Persist the scores together with the dictionary they were derived from
save(word_scores, dictionary_to_use, file = "working/dictionaries.Rdata")
